# librerías necesarias para implementar las funciones
library(readxl)
library(glue)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(ggmosaic)
library(ggridges)
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ✔ readr 2.1.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ data.table::between() masks dplyr::between()
## ✖ dplyr::filter() masks stats::filter()
## ✖ data.table::first() masks dplyr::first()
## ✖ lubridate::hour() masks data.table::hour()
## ✖ lubridate::isoweek() masks data.table::isoweek()
## ✖ dplyr::lag() masks stats::lag()
## ✖ data.table::last() masks dplyr::last()
## ✖ lubridate::mday() masks data.table::mday()
## ✖ lubridate::minute() masks data.table::minute()
## ✖ lubridate::month() masks data.table::month()
## ✖ lubridate::quarter() masks data.table::quarter()
## ✖ lubridate::second() masks data.table::second()
## ✖ purrr::transpose() masks data.table::transpose()
## ✖ lubridate::wday() masks data.table::wday()
## ✖ lubridate::week() masks data.table::week()
## ✖ lubridate::yday() masks data.table::yday()
## ✖ lubridate::year() masks data.table::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(pastecs)
##
## Attaching package: 'pastecs'
##
## The following object is masked from 'package:tidyr':
##
## extract
##
## The following objects are masked from 'package:data.table':
##
## first, last
##
## The following objects are masked from 'package:dplyr':
##
## first, last
library(xtable)
library(here)
## here() starts at /Users/sofiabocker/Desktop/universidad/UCR/Actuariales/Cuarto año/I Ciclo/Estadística Actuarial I/Proyecto/cod
library(skimr)
library(kableExtra)
##
## Attaching package: 'kableExtra'
##
## The following object is masked from 'package:dplyr':
##
## group_rows
# Import the data set from the Excel workbook.
# NOTE(review): the path is absolute and specific to one machine; consider
# building it relative to the project root (the `here` package is already loaded).
base_datos <- read_excel(
  path = "/Users/sofiabocker/Desktop/universidad/UCR/Actuariales/Cuarto año/I Ciclo/Estadística Actuarial I/Proyecto/base de datos/base_datos_alcohol.xlsx"
)
## New names:
## • `` -> `...32`
## • `` -> `...33`
# Drop the two unnamed columns that read_excel auto-named `...32` and `...33`
# (positions 32 and 33), then discard the last 25 rows of the sheet.
base_datos <- base_datos[, -c(32, 33)]
base_datos <- head(base_datos, n = -25)
# Collapse the five-category variables into three categories.
# Column names are normalised once with clean_names(); every collapsed column
# is converted back to character so the data stay as plain strings, not factors.
base_datos_clean <- base_datos %>%
  clean_names() %>%
  mutate(
    # Variables measured on the Very Low ... Very High scale
    across(
      c(
        alcohol_weekdays,
        alcohol_weekends,
        free_time_after_school,
        time_with_friends
      ),
      function(x) {
        as.character(fct_collapse(
          x,
          Low = c("Low", "Very Low"),
          High = c("High", "Very High"),
          Moderate = "Moderate"
        ))
      }
    ),
    # Health status: Very Poor ... Very Good
    health_status = as.character(fct_collapse(
      health_status,
      Poor = c("Poor", "Very Poor"),
      Good = c("Very Good", "Good"),
      Fair = "Fair"
    )),
    # Family relationship: Very Poor ... Excellent
    good_family_relationship = as.character(fct_collapse(
      good_family_relationship,
      Poor = c("Poor", "Very Poor"),
      Good = c("Excellent", "Good"),
      Fair = "Fair"
    ))
  )
head(base_datos_clean, n = 6L) # show the first six observations
## # A tibble: 6 × 31
## school gender age housing_type family_size parental_status mother_education
## <chr> <chr> <dbl> <chr> <chr> <chr> <chr>
## 1 Gabrie… Female 18 Urban Above 3 Separated Higher Education
## 2 Gabrie… Female 17 Urban Above 3 Living Together Primary School
## 3 Gabrie… Female 15 Urban Up to 3 Living Together Primary School
## 4 Gabrie… Female 15 Urban Above 3 Living Together Higher Education
## 5 Gabrie… Female 16 Urban Above 3 Living Together High School
## 6 Gabrie… Male 16 Urban Up to 3 Living Together Higher Education
## # ℹ 24 more variables: father_education <chr>, mother_work <chr>,
## # father_work <chr>, reason_school_choice <chr>, legal_responsibility <chr>,
## # commute_time <chr>, weekly_study_time <chr>,
## # extra_educational_support <chr>, parental_educational_support <chr>,
## # private_tutoring <chr>, extracurricular_activities <chr>,
## # attended_daycare <chr>, desire_graduate_education <chr>,
## # has_internet <chr>, is_dating <chr>, good_family_relationship <chr>, …
# Show the structure of the data. str() is called only for its printed output;
# its return value is NULL, so it is not assigned (the old assignment also
# created a global named like the base function `str`).
str(base_datos_clean)
## tibble [649 × 31] (S3: tbl_df/tbl/data.frame)
## $ school : chr [1:649] "Gabriel Pereira" "Gabriel Pereira" "Gabriel Pereira" "Gabriel Pereira" ...
## $ gender : chr [1:649] "Female" "Female" "Female" "Female" ...
## $ age : num [1:649] 18 17 15 15 16 16 16 17 15 15 ...
## $ housing_type : chr [1:649] "Urban" "Urban" "Urban" "Urban" ...
## $ family_size : chr [1:649] "Above 3" "Above 3" "Up to 3" "Above 3" ...
## $ parental_status : chr [1:649] "Separated" "Living Together" "Living Together" "Living Together" ...
## $ mother_education : chr [1:649] "Higher Education" "Primary School" "Primary School" "Higher Education" ...
## $ father_education : chr [1:649] "Higher Education" "Primary School" "Primary School" "Lower Secondary School" ...
## $ mother_work : chr [1:649] "Homemaker" "Homemaker" "Homemaker" "Health" ...
## $ father_work : chr [1:649] "Teacher" "other" "other" "Services" ...
## $ reason_school_choice : chr [1:649] "Course Preference" "Course Preference" "Other" "Near Home" ...
## $ legal_responsibility : chr [1:649] "Mother" "Father" "Mother" "Mother" ...
## $ commute_time : chr [1:649] "15 to 30 min" "Up to 15 min" "Up to 15 min" "Up to 15 min" ...
## $ weekly_study_time : chr [1:649] "2 to 5h" "2 to 5h" "2 to 5h" "5 to 10h" ...
## $ extra_educational_support : chr [1:649] "Yes" "No" "Yes" "No" ...
## $ parental_educational_support: chr [1:649] "No" "Yes" "No" "Yes" ...
## $ private_tutoring : chr [1:649] "No" "No" "No" "No" ...
## $ extracurricular_activities : chr [1:649] "No" "No" "No" "Yes" ...
## $ attended_daycare : chr [1:649] "Yes" "No" "Yes" "Yes" ...
## $ desire_graduate_education : chr [1:649] "Yes" "Yes" "Yes" "Yes" ...
## $ has_internet : chr [1:649] "No" "Yes" "Yes" "Yes" ...
## $ is_dating : chr [1:649] "No" "No" "No" "Yes" ...
## $ good_family_relationship : chr [1:649] "Good" "Good" "Good" "Fair" ...
## $ free_time_after_school : chr [1:649] "Moderate" "Moderate" "Moderate" "Low" ...
## $ time_with_friends : chr [1:649] "High" "Moderate" "Low" "Low" ...
## $ alcohol_weekdays : chr [1:649] "Low" "Low" "Low" "Low" ...
## $ alcohol_weekends : chr [1:649] "Low" "Low" "Moderate" "Low" ...
## $ health_status : chr [1:649] "Fair" "Fair" "Fair" "Good" ...
## $ school_absence : num [1:649] 4 2 6 0 0 6 0 2 0 0 ...
## $ grade_1st_semester : num [1:649] 0 9 12 14 11 12 13 10 15 12 ...
## $ grade_2nd_semester : num [1:649] 11 11 13 14 13 12 12 13 16 12 ...
# Dimensions of the data set: number of rows, then number of columns
c(nrow(base_datos_clean), ncol(base_datos_clean))
## [1] 649 31
# General summary of every column in the data set
base_datos_clean %>% summary()
## school gender age housing_type
## Length:649 Length:649 Min. :15.00 Length:649
## Class :character Class :character 1st Qu.:16.00 Class :character
## Mode :character Mode :character Median :17.00 Mode :character
## Mean :16.74
## 3rd Qu.:18.00
## Max. :22.00
## family_size parental_status mother_education father_education
## Length:649 Length:649 Length:649 Length:649
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## mother_work father_work reason_school_choice
## Length:649 Length:649 Length:649
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## legal_responsibility commute_time weekly_study_time
## Length:649 Length:649 Length:649
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## extra_educational_support parental_educational_support private_tutoring
## Length:649 Length:649 Length:649
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## extracurricular_activities attended_daycare desire_graduate_education
## Length:649 Length:649 Length:649
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## has_internet is_dating good_family_relationship
## Length:649 Length:649 Length:649
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## free_time_after_school time_with_friends alcohol_weekdays
## Length:649 Length:649 Length:649
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## alcohol_weekends health_status school_absence grade_1st_semester
## Length:649 Length:649 Min. : 0.000 Min. : 0.0
## Class :character Class :character 1st Qu.: 0.000 1st Qu.:10.0
## Mode :character Mode :character Median : 2.000 Median :11.0
## Mean : 3.659 Mean :11.4
## 3rd Qu.: 6.000 3rd Qu.:13.0
## Max. :32.000 Max. :19.0
## grade_2nd_semester
## Min. : 0.00
## 1st Qu.:10.00
## Median :11.00
## Mean :11.57
## 3rd Qu.:13.00
## Max. :19.00
# Explore the data: skim() prints an overview of base_datos_clean with one
# table per column type (character and numeric), including missing counts,
# number of unique values and basic distribution statistics.
skimr::skim(base_datos_clean)
| Name | base_datos_clean |
| Number of rows | 649 |
| Number of columns | 31 |
| _______________________ | |
| Column type frequency: | |
| character | 27 |
| numeric | 4 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| school | 0 | 1 | 15 | 20 | 0 | 2 | 0 |
| gender | 0 | 1 | 4 | 6 | 0 | 2 | 0 |
| housing_type | 0 | 1 | 5 | 5 | 0 | 2 | 0 |
| family_size | 0 | 1 | 7 | 7 | 0 | 2 | 0 |
| parental_status | 0 | 1 | 9 | 15 | 0 | 2 | 0 |
| mother_education | 0 | 1 | 4 | 22 | 0 | 5 | 0 |
| father_education | 0 | 1 | 4 | 22 | 0 | 5 | 0 |
| mother_work | 0 | 1 | 5 | 9 | 0 | 5 | 0 |
| father_work | 0 | 1 | 5 | 9 | 0 | 5 | 0 |
| reason_school_choice | 0 | 1 | 5 | 17 | 0 | 4 | 0 |
| legal_responsibility | 0 | 1 | 5 | 6 | 0 | 3 | 0 |
| commute_time | 0 | 1 | 12 | 12 | 0 | 4 | 0 |
| weekly_study_time | 0 | 1 | 7 | 13 | 0 | 4 | 0 |
| extra_educational_support | 0 | 1 | 2 | 3 | 0 | 2 | 0 |
| parental_educational_support | 0 | 1 | 2 | 3 | 0 | 2 | 0 |
| private_tutoring | 0 | 1 | 2 | 3 | 0 | 2 | 0 |
| extracurricular_activities | 0 | 1 | 2 | 3 | 0 | 2 | 0 |
| attended_daycare | 0 | 1 | 2 | 3 | 0 | 2 | 0 |
| desire_graduate_education | 0 | 1 | 2 | 3 | 0 | 2 | 0 |
| has_internet | 0 | 1 | 2 | 3 | 0 | 2 | 0 |
| is_dating | 0 | 1 | 2 | 3 | 0 | 2 | 0 |
| good_family_relationship | 0 | 1 | 4 | 4 | 0 | 3 | 0 |
| free_time_after_school | 0 | 1 | 3 | 8 | 0 | 3 | 0 |
| time_with_friends | 0 | 1 | 3 | 8 | 0 | 3 | 0 |
| alcohol_weekdays | 0 | 1 | 3 | 8 | 0 | 3 | 0 |
| alcohol_weekends | 0 | 1 | 3 | 8 | 0 | 3 | 0 |
| health_status | 0 | 1 | 4 | 4 | 0 | 3 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| age | 0 | 1 | 16.74 | 1.22 | 15 | 16 | 17 | 18 | 22 | ▇▅▅▁▁ |
| school_absence | 0 | 1 | 3.66 | 4.64 | 0 | 0 | 2 | 6 | 32 | ▇▂▁▁▁ |
| grade_1st_semester | 0 | 1 | 11.40 | 2.75 | 0 | 10 | 11 | 13 | 19 | ▁▂▇▇▁ |
| grade_2nd_semester | 0 | 1 | 11.57 | 2.91 | 0 | 10 | 11 | 13 | 19 | ▁▁▇▇▂ |
# Summary of the data set by school: the rows are split by the value of
# `school` and summary() is applied to each subset.
by(base_datos_clean, base_datos_clean$school, summary)
## base_datos_clean$school: Gabriel Pereira
## school gender age housing_type
## Length:423 Length:423 Min. :15.00 Length:423
## Class :character Class :character 1st Qu.:16.00 Class :character
## Mode :character Mode :character Median :17.00 Mode :character
## Mean :16.67
## 3rd Qu.:18.00
## Max. :22.00
## family_size parental_status mother_education father_education
## Length:423 Length:423 Length:423 Length:423
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## mother_work father_work reason_school_choice
## Length:423 Length:423 Length:423
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## legal_responsibility commute_time weekly_study_time
## Length:423 Length:423 Length:423
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## extra_educational_support parental_educational_support private_tutoring
## Length:423 Length:423 Length:423
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## extracurricular_activities attended_daycare desire_graduate_education
## Length:423 Length:423 Length:423
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## has_internet is_dating good_family_relationship
## Length:423 Length:423 Length:423
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## free_time_after_school time_with_friends alcohol_weekdays
## Length:423 Length:423 Length:423
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## alcohol_weekends health_status school_absence grade_1st_semester
## Length:423 Length:423 Min. : 0.000 Min. : 0.00
## Class :character Class :character 1st Qu.: 0.000 1st Qu.:10.00
## Mode :character Mode :character Median : 2.000 Median :12.00
## Mean : 4.215 Mean :11.99
## 3rd Qu.: 6.000 3rd Qu.:14.00
## Max. :32.000 Max. :18.00
## grade_2nd_semester
## Min. : 6.00
## 1st Qu.:10.00
## Median :12.00
## Mean :12.14
## 3rd Qu.:14.00
## Max. :19.00
## ------------------------------------------------------------
## base_datos_clean$school: Mousinho da Silveira
## school gender age housing_type
## Length:226 Length:226 Min. :15.00 Length:226
## Class :character Class :character 1st Qu.:16.00 Class :character
## Mode :character Mode :character Median :17.00 Mode :character
## Mean :16.89
## 3rd Qu.:18.00
## Max. :20.00
## family_size parental_status mother_education father_education
## Length:226 Length:226 Length:226 Length:226
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## mother_work father_work reason_school_choice
## Length:226 Length:226 Length:226
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## legal_responsibility commute_time weekly_study_time
## Length:226 Length:226 Length:226
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## extra_educational_support parental_educational_support private_tutoring
## Length:226 Length:226 Length:226
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## extracurricular_activities attended_daycare desire_graduate_education
## Length:226 Length:226 Length:226
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## has_internet is_dating good_family_relationship
## Length:226 Length:226 Length:226
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## free_time_after_school time_with_friends alcohol_weekdays
## Length:226 Length:226 Length:226
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## alcohol_weekends health_status school_absence grade_1st_semester
## Length:226 Length:226 Min. : 0.000 Min. : 4.0
## Class :character Class :character 1st Qu.: 0.000 1st Qu.: 8.0
## Mode :character Mode :character Median : 2.000 Median :10.0
## Mean : 2.619 Mean :10.3
## 3rd Qu.: 4.000 3rd Qu.:12.0
## Max. :12.000 Max. :19.0
## grade_2nd_semester
## Min. : 0.00
## 1st Qu.: 9.00
## Median :10.00
## Mean :10.50
## 3rd Qu.:12.75
## Max. :18.00
# Build a data frame holding only the numeric columns.
# select(where(...)) replaces the superseded scoped verb select_if().
base_datos_num <- base_datos_clean %>%
  select(where(is.numeric))
base_datos_num
## # A tibble: 649 × 4
## age school_absence grade_1st_semester grade_2nd_semester
## <dbl> <dbl> <dbl> <dbl>
## 1 18 4 0 11
## 2 17 2 9 11
## 3 15 6 12 13
## 4 15 0 14 14
## 5 16 0 11 13
## 6 16 6 12 12
## 7 16 0 13 12
## 8 17 2 10 13
## 9 15 0 15 16
## 10 15 0 12 12
## # ℹ 639 more rows
# Interquartile range of each numeric column, as a named list
rango_intercuantil <- purrr::map(base_datos_num, IQR)
rango_intercuantil
## $age
## [1] 2
##
## $school_absence
## [1] 6
##
## $grade_1st_semester
## [1] 3
##
## $grade_2nd_semester
## [1] 3
# Standard deviation of each numeric column, as a named list
desviacion_estandar <- purrr::map(base_datos_num, sd)
desviacion_estandar
## $age
## [1] 1.218138
##
## $school_absence
## [1] 4.640759
##
## $grade_1st_semester
## [1] 2.745265
##
## $grade_2nd_semester
## [1] 2.913639
# Variance of each numeric column, as a named list
varianza <- purrr::map(base_datos_num, var)
varianza
## $age
## [1] 1.483859
##
## $school_absence
## [1] 21.53664
##
## $grade_1st_semester
## [1] 7.536481
##
## $grade_2nd_semester
## [1] 8.48929
# More detailed descriptive statistics for the numeric columns (pastecs)
estadisticas <- base_datos_num %>%
  pastecs::stat.desc()
estadisticas
## age school_absence grade_1st_semester grade_2nd_semester
## nbr.val 6.490000e+02 649.0000000 649.0000000 649.0000000
## nbr.null 0.000000e+00 244.0000000 1.0000000 7.0000000
## nbr.na 0.000000e+00 0.0000000 0.0000000 0.0000000
## min 1.500000e+01 0.0000000 0.0000000 0.0000000
## max 2.200000e+01 32.0000000 19.0000000 19.0000000
## range 7.000000e+00 32.0000000 19.0000000 19.0000000
## sum 1.086700e+04 2375.0000000 7398.0000000 7509.0000000
## median 1.700000e+01 2.0000000 11.0000000 11.0000000
## mean 1.674422e+01 3.6594761 11.3990755 11.5701079
## SE.mean 4.781608e-02 0.1821657 0.1077611 0.1143703
## CI.mean.0.95 9.389318e-02 0.3577064 0.2116031 0.2245812
## var 1.483859e+00 21.5366423 7.5364806 8.4892903
## std.dev 1.218138e+00 4.6407588 2.7452651 2.9136387
## coef.var 7.274973e-02 1.2681484 0.2408323 0.2518247
# Correlation coefficient of every numeric column with the age column
corr_edad <- purrr::map(
  base_datos_num,
  function(columna) cor(columna, base_datos_num[["age"]])
)
corr_edad
## $age
## [1] 1
##
## $school_absence
## [1] 0.1499982
##
## $grade_1st_semester
## [1] -0.1743222
##
## $grade_2nd_semester
## [1] -0.1071191
# Turn the list of correlations into a data frame (one row per column) so it
# can be handed to ggplot.
corr_edad <- data.frame(
  column_names = names(corr_edad),
  correlation = unlist(corr_edad, use.names = TRUE)
)
# Bar chart of the correlations; the fill diverges around zero
# (blue = negative, white = zero, red = positive).
ggplot(
  data = corr_edad,
  mapping = aes(x = column_names, y = correlation, fill = correlation)
) +
  geom_col() +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0) +
  labs(title = "Correlación con Edad", x = "Columnas", y = "Correlación")
# Correlation coefficient of every numeric column with the absences column
corr_ausencias <- purrr::map(
  base_datos_num,
  function(columna) cor(columna, base_datos_num[["school_absence"]])
)
corr_ausencias
## $age
## [1] 0.1499982
##
## $school_absence
## [1] 1
##
## $grade_1st_semester
## [1] -0.1471492
##
## $grade_2nd_semester
## [1] -0.1247449
# Turn the absence correlations into a data frame for plotting.
# Stored under its own name instead of reusing `corr_edad`, so the age
# correlations are not silently overwritten by unrelated data.
corr_ausencias_df <- data.frame(
  column_names = names(corr_ausencias),
  correlation = unlist(corr_ausencias)
)
# Bar chart of the correlations; the fill diverges around zero
# (blue = negative, white = zero, red = positive).
ggplot(
  corr_ausencias_df,
  aes(x = column_names, y = correlation, fill = correlation)
) +
  geom_col() +
  labs(title = "Correlación con Ausencias", x = "Columnas", y = "Correlación") +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0)
# Correlation coefficient of every numeric column with the first-semester grade
corr_notas_primer_sem <- purrr::map(
  base_datos_num,
  function(columna) cor(columna, base_datos_num[["grade_1st_semester"]])
)
corr_notas_primer_sem
## $age
## [1] -0.1743222
##
## $school_absence
## [1] -0.1471492
##
## $grade_1st_semester
## [1] 1
##
## $grade_2nd_semester
## [1] 0.8649816
# Turn the first-semester grade correlations into a data frame for plotting.
# Stored under its own name instead of reusing `corr_edad`, so the age
# correlations are not silently overwritten by unrelated data.
corr_notas_primer_sem_df <- data.frame(
  column_names = names(corr_notas_primer_sem),
  correlation = unlist(corr_notas_primer_sem)
)
# Bar chart of the correlations; the fill diverges around zero
# (blue = negative, white = zero, red = positive).
ggplot(
  corr_notas_primer_sem_df,
  aes(x = column_names, y = correlation, fill = correlation)
) +
  geom_col() +
  labs(title = "Correlación con Notas Primer Semestre", x = "Columnas", y = "Correlación") +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0)
# Correlation coefficient of every numeric column with the second-semester grade
corr_notas_segundo_sem <- purrr::map(
  base_datos_num,
  function(columna) cor(columna, base_datos_num[["grade_2nd_semester"]])
)
corr_notas_segundo_sem
## $age
## [1] -0.1071191
##
## $school_absence
## [1] -0.1247449
##
## $grade_1st_semester
## [1] 0.8649816
##
## $grade_2nd_semester
## [1] 1
# Turn the second-semester grade correlations into a data frame for plotting.
# Stored under its own name instead of reusing `corr_edad`, so the age
# correlations are not silently overwritten by unrelated data.
corr_notas_segundo_sem_df <- data.frame(
  column_names = names(corr_notas_segundo_sem),
  correlation = unlist(corr_notas_segundo_sem)
)
# Bar chart of the correlations; the fill diverges around zero
# (blue = negative, white = zero, red = positive).
ggplot(
  corr_notas_segundo_sem_df,
  aes(x = column_names, y = correlation, fill = correlation)
) +
  geom_col() +
  labs(title = "Correlación con Notas Segundo Semestre", x = "Columnas", y = "Correlación") +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0)
# Create one histogram per quantitative column.
# Returns a list of ggplot objects (one per column of base_datos_num); each
# column is looked up by name through the `.data` pronoun, and the title and
# x-axis label are set to the column name.
lapply(names(base_datos_num), function(col_name) {
  ggplot(base_datos_num, aes(x = .data[[col_name]])) +
    geom_histogram(binwidth = 1, fill = "blue") +
    # y-axis label spelling corrected ("Frecuencia")
    labs(title = col_name, x = col_name, y = "Frecuencia")
})
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
# Create one boxplot per quantitative column.
# aes_string() is deprecated since ggplot2 3.0.0; the column is selected by
# name through the `.data` pronoun instead. The x-axis label is set explicitly
# so it still shows the plain column name.
lapply(names(base_datos_num), function(col_name) {
  ggplot(base_datos_num, aes(x = .data[[col_name]])) +
    geom_boxplot(
      outlier.colour = "black",
      outlier.shape = 16,
      outlier.size = 2,
      notch = FALSE
    ) +
    labs(x = col_name)
})
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
# Box plots of the age column against each of the other quantitative columns.
# setdiff() replaces names(x)[-which(...)]: the latter returns an EMPTY vector
# when the name is not found, so no plot would be drawn.
# Each boxplot() call draws the plot and returns its statistics list, which
# lapply() collects (and prints at top level).
lapply(setdiff(names(base_datos_num), "age"), function(col_name) {
  boxplot(
    base_datos_num$age ~ base_datos_num[[col_name]],
    main = paste("Edad y", col_name)
  )
})
## [[1]]
## [[1]]$stats
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14]
## [1,] 15 15 15 15.0 15 16 15 15.0 15 18 16 15 16 18
## [2,] 16 16 16 16.5 16 17 16 15.5 16 18 17 15 16 18
## [3,] 17 16 17 18.0 17 17 16 16.0 17 18 17 15 17 18
## [4,] 17 17 17 18.0 18 18 17 16.0 18 18 18 16 19 18
## [5,] 18 18 18 18.0 19 19 18 16.0 20 18 19 16 22 18
## [,15] [,16] [,17] [,18] [,19] [,20] [,21] [,22] [,23] [,24]
## [1,] 16.0 17.0 15 17.0 17 17 15 18 17 17
## [2,] 16.5 17.0 16 17.0 17 17 15 18 17 17
## [3,] 17.0 17.5 17 17.0 19 17 15 18 17 17
## [4,] 17.0 18.0 18 17.5 21 17 15 18 17 17
## [5,] 17.0 18.0 19 18.0 21 17 15 18 17 17
##
## [[1]]$n
## [1] 244 12 110 7 93 12 49 3 42 7 21 5 12 1 8 2 10 3 2
## [20] 2 1 1 1 1
##
## [[1]]$conf
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,] 16.89885 15.54389 16.84935 17.10422 16.67232 16.54389 15.77429 15.54389
## [2,] 17.10115 16.45611 17.15065 18.89578 17.32768 17.45611 16.22571 16.45611
## [,9] [,10] [,11] [,12] [,13] [,14] [,15] [,16] [,17]
## [1,] 16.5124 18 16.65522 14.2934 15.63168 18 16.72069 16.38277 16.00072
## [2,] 17.4876 18 17.34478 15.7066 18.36832 18 17.27931 18.61723 17.99928
## [,18] [,19] [,20] [,21] [,22] [,23] [,24]
## [1,] 16.54389 14.53109 17 15 18 17 17
## [2,] 17.45611 23.46891 17 15 18 17 17
##
## [[1]]$out
## [1] 19 19 19 19 20 19 19 21 19 19 19 19 19 19 19 19 19 20 19 19 19 17 15 15 18
## [26] 18
##
## [[1]]$group
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 6 7 7 10 10 11 11 12
## [26] 15
##
## [[1]]$names
## [1] "0" "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14"
## [16] "15" "16" "18" "21" "22" "24" "26" "30" "32"
##
##
## [[2]]
## [[2]]$stats
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14]
## [1,] 18 18 18 15 15 16 15 15 15 15 15 15 15 15
## [2,] 18 18 18 16 16 17 16 16 16 16 15 16 16 16
## [3,] 18 18 18 17 17 17 17 17 17 16 16 16 17 16
## [4,] 18 18 19 18 18 18 18 18 17 17 17 18 17 18
## [5,] 18 18 19 19 19 19 21 20 18 18 18 20 18 18
## [,15] [,16] [,17]
## [1,] 16.0 17 18
## [2,] 16.0 17 18
## [3,] 17.0 17 18
## [4,] 17.5 18 18
## [5,] 18.0 18 18
##
## [[2]]$n
## [1] 1 2 5 9 33 42 65 95 91 82 72 71 35 22 16 7 1
##
## [[2]]$conf
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
## [1,] 18 18 17.2934 15.94667 16.44991 16.7562 16.60805 16.67579 16.83437
## [2,] 18 18 18.7066 18.05333 17.55009 17.2438 17.39195 17.32421 17.16563
## [,10] [,11] [,12] [,13] [,14] [,15] [,16] [,17]
## [1,] 15.82552 15.62759 15.62498 16.73293 15.32629 16.4075 16.40282 18
## [2,] 16.17448 16.37241 16.37502 17.26707 16.67371 17.5925 17.59718 18
##
## [[2]]$out
## [1] 16 22 15 15 20 20 19 19 19 19 19 20 20 19 19 15
##
## [[2]]$group
## [1] 3 5 6 6 6 6 9 9 9 9 9 9 9 10 10 16
##
## [[2]]$names
## [1] "0" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14" "15" "16" "17"
## [16] "18" "19"
##
##
## [[3]]
## [[3]]$stats
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14]
## [1,] 16.0 16 16.0 15 15 15 15 15 15.0 15 15 15 15 15.0
## [2,] 17.5 17 16.5 16 16 16 16 16 16.0 16 15 16 16 16.0
## [3,] 18.0 18 17.0 17 17 17 17 16 16.5 16 16 17 17 17.5
## [4,] 18.5 18 18.0 18 18 18 18 17 17.0 17 17 17 18 18.0
## [5,] 19.0 18 18.0 18 19 20 21 18 18.0 18 18 18 18 18.0
## [,15] [,16]
## [1,] 17 17
## [2,] 17 17
## [3,] 17 17
## [4,] 18 17
## [5,] 18 17
##
## [[3]]$n
## [1] 7 3 7 16 40 72 83 103 86 80 54 38 25 20 14 1
##
## [[3]]$conf
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,] 17.40282 17.08779 16.10422 16.21 16.50036 16.62759 16.65314 15.84432
## [2,] 18.59718 18.91221 17.89578 17.79 17.49964 17.37241 17.34686 16.15568
## [,9] [,10] [,11] [,12] [,13] [,14] [,15] [,16]
## [1,] 16.32962 15.82335 15.56998 16.74369 16.368 16.7934 16.57773 17
## [2,] 16.67038 16.17665 16.43002 17.25631 17.632 18.2066 17.42227 17
##
## [[3]]$out
## [1] 22 19 19 19 20 19 19 20 19 21 19 19 19 20 20
##
## [[3]]$group
## [1] 5 8 8 8 8 8 9 9 9 9 10 10 10 12 12
##
## [[3]]$names
## [1] "0" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14" "15" "16" "17" "18"
## [16] "19"
# Box plots of the absences column against each of the other quantitative columns.
# setdiff() replaces names(x)[-which(...)]: the latter returns an EMPTY vector
# when the name is not found, so no plot would be drawn.
# Each boxplot() call draws the plot and returns its statistics list, which
# lapply() collects (and prints at top level).
lapply(setdiff(names(base_datos_num), "school_absence"), function(col_name) {
  boxplot(
    base_datos_num$school_absence ~ base_datos_num[[col_name]],
    main = paste("Ausencia y", col_name)
  )
})
## [[1]]
## [[1]]$stats
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,] 0 0 0 0 0 5 0.0 12
## [2,] 0 0 0 0 0 5 0.0 12
## [3,] 2 2 2 3 4 8 10.5 12
## [4,] 4 4 6 7 6 8 21.0 12
## [5,] 10 10 15 16 12 12 21.0 12
##
## [[1]]$n
## [1] 112 177 179 140 32 6 2 1
##
## [[1]]$conf
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,] 1.402816 1.52496 1.291432 2.065259 2.324157 6.064903 -12.9618 12
## [2,] 2.597184 2.47504 2.708568 3.934741 5.675843 9.935097 33.9618 12
##
## [[1]]$out
## [1] 16 24 11 11 11 16 14 14 12 12 12 12 16 16 11 22 32 16 30 21 16 22 18 18 18
## [26] 26 16 16 0
##
## [[1]]$group
## [1] 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 4 4 5 5 6
##
## [[1]]$names
## [1] "15" "16" "17" "18" "19" "20" "21" "22"
##
##
## [[2]]
## [[2]]$stats
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14]
## [1,] 4 0.0 0 0 0 0 0 0 0.0 0 0 0 0 0.0
## [2,] 4 0.0 0 0 0 0 1 0 0.0 0 0 0 0 0.0
## [3,] 4 4.5 0 2 4 4 4 2 2.0 2 2 2 2 0.5
## [4,] 4 9.0 2 4 8 8 6 7 5.5 6 4 4 5 4.0
## [5,] 4 9.0 2 6 12 16 12 16 10.0 15 10 10 12 10.0
## [,15] [,16] [,17]
## [1,] 0 0 0
## [2,] 0 0 0
## [3,] 0 0 0
## [4,] 2 0 0
## [5,] 4 0 0
##
## [[2]]$n
## [1] 1 2 5 9 33 42 65 95 91 82 72 71 35 22 16 7 1
##
## [[2]]$conf
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,] 4 -5.555058 -1.413195 -0.1066667 1.799658 2.049606 3.020126 0.8652679
## [2,] 4 14.555058 1.413195 4.1066667 6.200342 5.950394 4.979874 3.1347321
## [,9] [,10] [,11] [,12] [,13] [,14] [,15] [,16] [,17]
## [1,] 1.08904 0.9531091 1.255181 1.249954 0.6646563 -0.8474285 -0.79 0 0
## [2,] 2.91096 3.0468909 2.744819 2.750046 3.3353437 1.8474285 0.79 0 0
##
## [[2]]$out
## [1] 8 12 26 24 22 16 14 16 14 21 18 18 16 16 16 16 22 12 21 15 18 13 32 30 14
## [26] 6 10
##
## [[2]]$group
## [1] 3 4 5 7 7 7 7 7 7 7 8 8 9 9 9 9 9 11 11 11 11 11 12 12 13
## [26] 15 16
##
## [[2]]$names
## [1] "0" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14" "15" "16" "17"
## [16] "18" "19"
##
##
## [[3]]
## [[3]]$stats
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14]
## [1,] 0 0 0.0 0.0 0 0 0 0.0 0 0 0 0 0 0.0
## [2,] 0 4 0.0 0.0 1 2 0 0.5 0 0 0 0 0 0.0
## [3,] 0 8 2.0 4.5 4 4 2 4.0 2 2 2 2 2 0.0
## [4,] 0 8 2.5 11.0 8 8 6 6.0 4 4 4 4 4 2.5
## [5,] 0 8 4.0 22.0 16 16 12 14.0 8 10 9 10 10 6.0
## [,15] [,16]
## [1,] 0 0
## [2,] 0 0
## [3,] 0 0
## [4,] 0 0
## [5,] 0 0
##
## [[3]]$n
## [1] 7 3 7 16 40 72 83 103 86 80 54 38 25 20 14 1
##
## [[3]]$conf
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,] 0 4.351146 0.5070403 0.155 2.25126 2.882771 0.9594348 3.143749
## [2,] 0 11.648854 3.4929597 8.845 5.74874 5.117229 3.0405652 4.856251
## [,9] [,10] [,11] [,12] [,13] [,14] [,15] [,16]
## [1,] 1.318497 1.293403 1.139957 0.9747606 0.736 -0.8832469 0 0
## [2,] 2.681503 2.706597 2.860043 3.0252394 3.264 0.8832469 0 0
##
## [[3]]$out
## [1] 24 26 16 18 21 16 16 16 22 16 15 12 16 12 18 12 32 21 18 11 14 13 30 10 10
## [26] 2 10 4
##
## [[3]]$group
## [1] 5 5 7 7 7 8 8 8 8 8 9 9 9 9 9 10 10 10 10 10 11 11 12 14 14
## [26] 15 15 15
##
## [[3]]$names
## [1] "0" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14" "15" "16" "17" "18"
## [16] "19"
# Box plots of the first-semester grade against each of the other quantitative columns.
# setdiff() replaces names(x)[-which(...)]: the latter returns an EMPTY vector
# when the name is not found, so no plot would be drawn.
# Each boxplot() call draws the plot and returns its statistics list, which
# lapply() collects (and prints at top level).
lapply(setdiff(names(base_datos_num), "grade_1st_semester"), function(col_name) {
  boxplot(
    base_datos_num$grade_1st_semester ~ base_datos_num[[col_name]],
    main = paste("Nota Primer Semestre y", col_name)
  )
})
## [[1]]
## [[1]]$stats
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,] 6 5 6 4 5.0 8.0 9 7
## [2,] 10 10 10 9 7.5 8.0 9 7
## [3,] 12 12 11 11 9.0 10.5 9 7
## [4,] 13 14 13 14 10.5 11.0 9 7
## [5,] 16 17 17 19 14.0 14.0 9 7
##
## [[1]]$n
## [1] 112 177 179 140 32 6 2 1
##
## [[1]]$conf
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,] 11.55211 11.52496 10.64572 10.33233 8.162078 8.564903 9 7
## [2,] 12.44789 12.47504 11.35428 11.66767 9.837922 12.435097 9 7
##
## [[1]]$out
## [1] 18 18 18 18 0
##
## [[1]]$group
## [1] 1 3 3 3 4
##
## [[1]]$names
## [1] "15" "16" "17" "18" "19" "20" "21" "22"
##
##
## [[2]]
## [[2]]$stats
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14]
## [1,] 4 9.0 5.0 7.0 6 8.0 6 7.0 5 4.0 7 9 6.0 13
## [2,] 10 10.5 10.0 8.0 9 9.0 10 10.5 9 7.0 10 9 7.5 13
## [3,] 12 12.0 11.5 9.0 11 9.5 12 14.0 10 9.0 12 10 8.5 13
## [4,] 14 13.0 14.0 10.5 13 12.5 13 14.5 12 10.5 12 12 12.0 13
## [5,] 19 16.0 17.0 11.0 17 14.0 17 15.0 15 13.0 14 12 15.0 13
## [,15] [,16] [,17] [,18] [,19] [,20] [,21] [,22] [,23] [,24]
## [1,] 8.0 12.0 8 10.0 9 9 9 7 14 14
## [2,] 8.5 12.0 9 10.0 9 9 9 7 14 14
## [3,] 9.5 12.5 10 10.0 11 10 9 7 14 14
## [4,] 12.0 13.0 11 11.5 13 11 9 7 14 14
## [5,] 15.0 13.0 11 13.0 13 11 9 7 14 14
##
## [[2]]$n
## [1] 244 12 110 7 93 12 49 3 42 7 21 5 12 1 8 2 10 3 2
## [20] 2 1 1 1 1
##
## [[2]]$conf
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,] 11.5954 10.85973 10.89741 7.50704 10.34465 7.903627 11.32286 10.35115
## [2,] 12.4046 13.14027 12.10259 10.49296 11.65535 11.096373 12.67714 17.64885
## [,9] [,10] [,11] [,12] [,13] [,14] [,15] [,16]
## [1,] 9.268602 6.909856 11.31043 7.880208 6.44752 13 7.54485 11.38277
## [2,] 10.731398 11.090144 12.68957 12.119792 10.55248 13 11.45515 13.61723
## [,17] [,18] [,19] [,20] [,21] [,22] [,23] [,24]
## [1,] 9.00072 8.63168 6.531085 7.765543 9 7 14 14
## [2,] 10.99928 11.36832 15.468915 12.234457 9 7 14 14
##
## [[2]]$out
## [1] 17 0 18 16 16
##
## [[2]]$group
## [1] 4 5 11 11 11
##
## [[2]]$names
## [1] "0" "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14"
## [16] "15" "16" "18" "21" "22" "24" "26" "30" "32"
##
##
## [[3]]
## [[3]]$stats
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14]
## [1,] 4.0 5.0 5.0 6.0 4 6 7 9 9 10 13 13 14 13.0
## [2,] 5.0 6.0 6.5 7.0 7 8 9 10 11 12 13 14 15 14.5
## [3,] 7.0 7.0 7.0 7.5 8 9 10 11 12 13 14 14 15 16.0
## [4,] 7.5 7.5 7.5 8.0 9 10 11 11 13 14 14 15 16 17.0
## [5,] 9.0 8.0 8.0 9.0 10 12 13 12 15 15 15 16 17 19.0
## [,15] [,16]
## [1,] 16 18
## [2,] 17 18
## [3,] 17 18
## [4,] 18 18
## [5,] 18 18
##
## [[3]]$n
## [1] 7 3 7 16 40 72 83 103 86 80 54 38 25 20 14 1
##
## [[3]]$conf
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
## [1,] 5.50704 5.63168 6.402816 7.105 7.50036 8.62759 9.653145 10.84432 11.65925
## [2,] 8.49296 8.36832 7.597184 7.895 8.49964 9.37241 10.346855 11.15568 12.34075
## [,10] [,11] [,12] [,13] [,14] [,15] [,16]
## [1,] 12.6467 13.78499 13.74369 14.684 15.11675 16.57773 18
## [2,] 13.3533 14.21501 14.25631 15.316 16.88325 17.42227 18
##
## [[3]]$out
## [1] 0 13 13 13 13 7 14 6 8 8 16 16 12 12 11 12
##
## [[3]]$group
## [1] 8 8 8 8 8 8 8 8 8 10 11 11 12 12 12 13
##
## [[3]]$names
## [1] "0" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14" "15" "16" "17" "18"
## [16] "19"
# Box plots of the second-semester grade against every other quantitative column.
# setdiff() replaces x[-which(...)]: when nothing matches, -which() produces an
# empty index and silently drops every column instead of keeping them all.
lapply(setdiff(names(base_datos_num), "grade_2nd_semester"), function(col_name) {
  boxplot(
    base_datos_num$grade_2nd_semester ~ base_datos_num[[col_name]],
    main = paste("Nota Segundo Semestre y", col_name),
    # Explicit labels; otherwise the axes show the raw formula expressions.
    xlab = col_name,
    ylab = "Nota Segundo Semestre"
  )
})
## [[1]]
## [[1]]$stats
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,] 7.0 6 6 5 8.0 9.0 10 8
## [2,] 10.5 10 10 9 8.5 10.0 10 8
## [3,] 12.0 12 12 11 10.0 11.5 11 8
## [4,] 13.0 13 14 14 11.0 15.0 12 8
## [5,] 16.0 17 19 18 13.0 15.0 12 8
##
## [[1]]$n
## [1] 112 177 179 140 32 6 2 1
##
## [[1]]$conf
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,] 11.62676 11.64372 11.52762 10.33233 9.301732 8.274839 8.765543 8
## [2,] 12.37324 12.35628 12.47238 11.66767 10.698268 14.725161 13.234457 8
##
## [[1]]$out
## [1] 17 0 5 0 0 0 0 0 0
##
## [[1]]$group
## [1] 1 2 2 3 4 4 4 5 5
##
## [[1]]$names
## [1] "15" "16" "17" "18" "19" "20" "21" "22"
##
##
## [[2]]
## [[2]]$stats
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14]
## [1,] 5 9.0 6 6 6 7.0 7 9.0 5 7.0 7 11 7.0 14
## [2,] 10 10.0 10 8 10 9.5 10 11.5 9 8.0 10 11 8.5 14
## [3,] 12 11.5 12 10 11 11.0 12 14.0 11 9.0 11 11 10.0 14
## [4,] 14 13.5 13 12 13 13.5 13 15.0 13 9.5 13 11 11.5 14
## [5,] 19 16.0 17 17 16 17.0 17 16.0 15 10.0 17 11 13.0 14
## [,15] [,16] [,17] [,18] [,19] [,20] [,21] [,22] [,23] [,24]
## [1,] 7.0 9.0 7.0 10.0 10.0 7 8 8 15 13
## [2,] 8.5 9.0 9.0 11.0 10.0 7 8 8 15 13
## [3,] 10.0 10.5 10.5 12.0 11.5 9 8 8 15 13
## [4,] 11.0 12.0 11.0 12.5 13.0 11 8 8 15 13
## [5,] 14.0 12.0 12.0 13.0 13.0 11 8 8 15 13
##
## [[2]]$n
## [1] 244 12 110 7 93 12 49 3 42 7 21 5 12 1 8 2 10 3 2
## [20] 2 1 1 1 1
##
## [[2]]$conf
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,] 11.5954 9.903627 11.54806 7.611265 10.50848 9.175573 11.32286 10.80725
## [2,] 12.4046 13.096373 12.45194 12.388735 11.49152 12.824427 12.67714 17.19275
## [,9] [,10] [,11] [,12] [,13] [,14] [,15] [,16]
## [1,] 10.0248 8.104224 9.965647 11 8.63168 14 8.603464 7.148314
## [2,] 11.9752 9.895776 12.034353 11 11.36832 14 11.396536 13.851686
## [,17] [,18] [,19] [,20] [,21] [,22] [,23] [,24]
## [1,] 9.50072 10.63168 8.148314 4.531085 8 8 15 13
## [2,] 11.49928 13.36832 14.851686 13.468915 8 8 15 13
##
## [[2]]$out
## [1] 0 0 0 0 0 0 0 18 18 14 18 10 13
##
## [[2]]$group
## [1] 1 1 1 1 1 1 1 3 5 10 11 12 12
##
## [[2]]$names
## [1] "0" "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14"
## [16] "15" "16" "18" "21" "22" "24" "26" "30" "32"
##
##
## [[3]]
## [[3]]$stats
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14]
## [1,] 11 0 0 6 5 7 8 8 9 9 10 11 12 14
## [2,] 11 0 0 7 7 8 9 9 10 11 12 13 14 15
## [3,] 11 4 5 8 8 9 10 10 11 12 13 14 15 16
## [4,] 11 8 6 8 9 9 10 11 12 13 14 15 16 17
## [5,] 11 8 8 9 11 10 11 13 15 16 17 17 17 18
## [,15] [,16] [,17]
## [1,] 16.0 18 17
## [2,] 17.0 18 17
## [3,] 17.5 18 17
## [4,] 18.0 18 17
## [5,] 18.0 18 17
##
## [[3]]$n
## [1] 1 2 5 9 33 42 65 95 91 82 72 71 35 22 16 7 1
##
## [[3]]$conf
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,] 11 -4.93783 0.7604151 7.473333 7.449915 8.756201 9.804025 9.675791
## [2,] 11 12.93783 9.2395849 8.526667 8.550085 9.243799 10.195975 10.324209
## [,9] [,10] [,11] [,12] [,13] [,14] [,15] [,16] [,17]
## [1,] 10.66874 11.65104 12.62759 13.62498 14.46586 15.32629 17.105 18 17
## [2,] 11.33126 12.34896 13.37241 14.37502 15.53414 16.67371 17.895 18 17
##
## [[3]]$out
## [1] 11 0 0 6 5 13 11 6 0 7 12 12 7 7 0 17 19
##
## [[3]]$group
## [1] 4 5 5 6 6 6 6 6 6 7 7 7 7 7 7 16 16
##
## [[3]]$names
## [1] "0" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14" "15" "16" "17"
## [16] "18" "19"
# Build one density plot per quantitative column; the x axis is labelled with
# the column name.
lapply(names(base_datos_num), function(col_name) {
  datos_col <- data.frame(col = base_datos_num[[col_name]])
  lienzo <- ggplot(datos_col, aes(x = col))
  lienzo + geom_density() + labs(x = col_name)
})
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
# Build one bar chart of value counts per quantitative column.
lapply(names(base_datos_num), function(col_name) {
  datos_col <- data.frame(col = base_datos_num[[col_name]])
  etiquetas <- labs(title = col_name, x = col_name, y = "")
  ggplot(datos_col, aes(x = col)) +
    geom_bar(stat = "count", fill = "darkred") +
    etiquetas
})
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
# Keep only the character (qualitative) columns of the cleaned data.
# select(where(...)) replaces the superseded scoped verb select_if().
base_datos_str <- base_datos_clean %>% select(where(is.character))
base_datos_str
## # A tibble: 649 × 27
## school gender housing_type family_size parental_status mother_education
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 Gabriel Per… Female Urban Above 3 Separated Higher Education
## 2 Gabriel Per… Female Urban Above 3 Living Together Primary School
## 3 Gabriel Per… Female Urban Up to 3 Living Together Primary School
## 4 Gabriel Per… Female Urban Above 3 Living Together Higher Education
## 5 Gabriel Per… Female Urban Above 3 Living Together High School
## 6 Gabriel Per… Male Urban Up to 3 Living Together Higher Education
## 7 Gabriel Per… Male Urban Up to 3 Living Together Lower Secondary…
## 8 Gabriel Per… Female Urban Above 3 Separated Higher Education
## 9 Gabriel Per… Male Urban Up to 3 Separated High School
## 10 Gabriel Per… Male Urban Above 3 Living Together High School
## # ℹ 639 more rows
## # ℹ 21 more variables: father_education <chr>, mother_work <chr>,
## # father_work <chr>, reason_school_choice <chr>, legal_responsibility <chr>,
## # commute_time <chr>, weekly_study_time <chr>,
## # extra_educational_support <chr>, parental_educational_support <chr>,
## # private_tutoring <chr>, extracurricular_activities <chr>,
## # attended_daycare <chr>, desire_graduate_education <chr>, …
#' Count how often each distinct value of one column occurs.
#'
#' @param column Name of the column to tabulate, given as a single string.
#' @param data Data frame to tabulate. Defaults to `base_datos_str`, so
#'   existing one-argument calls behave as before.
#' @return The result of `count()`: one row per distinct value of `column`,
#'   with the number of occurrences in `n`.
count_df <- function(column, data = base_datos_str) {
  # Fail early on anything that is not exactly one column name.
  stopifnot(is.character(column), length(column) == 1L)
  count(data, !!sym(column))
}
# Tabulate every qualitative column and print the resulting list of counts.
countt <- lapply(names(base_datos_str), count_df)
print(countt)
## [[1]]
## # A tibble: 2 × 2
## school n
## <chr> <int>
## 1 Gabriel Pereira 423
## 2 Mousinho da Silveira 226
##
## [[2]]
## # A tibble: 2 × 2
## gender n
## <chr> <int>
## 1 Female 383
## 2 Male 266
##
## [[3]]
## # A tibble: 2 × 2
## housing_type n
## <chr> <int>
## 1 Rural 197
## 2 Urban 452
##
## [[4]]
## # A tibble: 2 × 2
## family_size n
## <chr> <int>
## 1 Above 3 457
## 2 Up to 3 192
##
## [[5]]
## # A tibble: 2 × 2
## parental_status n
## <chr> <int>
## 1 Living Together 569
## 2 Separated 80
##
## [[6]]
## # A tibble: 5 × 2
## mother_education n
## <chr> <int>
## 1 High School 139
## 2 Higher Education 175
## 3 Lower Secondary School 186
## 4 None 6
## 5 Primary School 143
##
## [[7]]
## # A tibble: 5 × 2
## father_education n
## <chr> <int>
## 1 High School 131
## 2 Higher Education 128
## 3 Lower Secondary School 209
## 4 None 7
## 5 Primary School 174
##
## [[8]]
## # A tibble: 5 × 2
## mother_work n
## <chr> <int>
## 1 Health 48
## 2 Homemaker 135
## 3 Services 136
## 4 Teacher 72
## 5 other 258
##
## [[9]]
## # A tibble: 5 × 2
## father_work n
## <chr> <int>
## 1 Health 23
## 2 Homemaker 42
## 3 Services 181
## 4 Teacher 36
## 5 other 367
##
## [[10]]
## # A tibble: 4 × 2
## reason_school_choice n
## <chr> <int>
## 1 Course Preference 285
## 2 Near Home 149
## 3 Other 72
## 4 Reputation 143
##
## [[11]]
## # A tibble: 3 × 2
## legal_responsibility n
## <chr> <int>
## 1 Father 153
## 2 Mother 455
## 3 Other 41
##
## [[12]]
## # A tibble: 4 × 2
## commute_time n
## <chr> <int>
## 1 15 to 30 min 213
## 2 30 min to 1h 54
## 3 More than 1h 16
## 4 Up to 15 min 366
##
## [[13]]
## # A tibble: 4 × 2
## weekly_study_time n
## <chr> <int>
## 1 2 to 5h 305
## 2 5 to 10h 97
## 3 More than 10h 35
## 4 Up to 2h 212
##
## [[14]]
## # A tibble: 2 × 2
## extra_educational_support n
## <chr> <int>
## 1 No 581
## 2 Yes 68
##
## [[15]]
## # A tibble: 2 × 2
## parental_educational_support n
## <chr> <int>
## 1 No 251
## 2 Yes 398
##
## [[16]]
## # A tibble: 2 × 2
## private_tutoring n
## <chr> <int>
## 1 No 610
## 2 Yes 39
##
## [[17]]
## # A tibble: 2 × 2
## extracurricular_activities n
## <chr> <int>
## 1 No 334
## 2 Yes 315
##
## [[18]]
## # A tibble: 2 × 2
## attended_daycare n
## <chr> <int>
## 1 No 128
## 2 Yes 521
##
## [[19]]
## # A tibble: 2 × 2
## desire_graduate_education n
## <chr> <int>
## 1 No 69
## 2 Yes 580
##
## [[20]]
## # A tibble: 2 × 2
## has_internet n
## <chr> <int>
## 1 No 151
## 2 Yes 498
##
## [[21]]
## # A tibble: 2 × 2
## is_dating n
## <chr> <int>
## 1 No 410
## 2 Yes 239
##
## [[22]]
## # A tibble: 3 × 2
## good_family_relationship n
## <chr> <int>
## 1 Fair 101
## 2 Good 497
## 3 Poor 51
##
## [[23]]
## # A tibble: 3 × 2
## free_time_after_school n
## <chr> <int>
## 1 High 246
## 2 Low 152
## 3 Moderate 251
##
## [[24]]
## # A tibble: 3 × 2
## time_with_friends n
## <chr> <int>
## 1 High 251
## 2 Low 193
## 3 Moderate 205
##
## [[25]]
## # A tibble: 3 × 2
## alcohol_weekdays n
## <chr> <int>
## 1 High 34
## 2 Low 572
## 3 Moderate 43
##
## [[26]]
## # A tibble: 3 × 2
## alcohol_weekends n
## <chr> <int>
## 1 High 132
## 2 Low 397
## 3 Moderate 120
##
## [[27]]
## # A tibble: 3 × 2
## health_status n
## <chr> <int>
## 1 Fair 124
## 2 Good 357
## 3 Poor 168
# Build one bar chart of category counts per qualitative column.
lapply(names(base_datos_str), function(col_name) {
  datos_col <- data.frame(col = base_datos_str[[col_name]])
  etiquetas <- labs(title = col_name, x = col_name, y = "")
  ggplot(datos_col, aes(x = col)) +
    geom_bar(stat = "count", fill = "darkred") +
    etiquetas
})
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
##
## [[11]]
##
## [[12]]
##
## [[13]]
##
## [[14]]
##
## [[15]]
##
## [[16]]
##
## [[17]]
##
## [[18]]
##
## [[19]]
##
## [[20]]
##
## [[21]]
##
## [[22]]
##
## [[23]]
##
## [[24]]
##
## [[25]]
##
## [[26]]
##
## [[27]]
# Ridgeline densities of the first-semester grade, one ridge per weekday
# alcohol level.
base_datos_clean |>
  ggplot(aes(x = grade_1st_semester, y = alcohol_weekdays, group = alcohol_weekdays)) +
  geom_density_ridges()
## Picking joint bandwidth of 0.747
# Ridgeline densities of the first-semester grade, one ridge per weekend
# alcohol level.
base_datos_clean |>
  ggplot(aes(x = grade_1st_semester, y = alcohol_weekends, group = alcohol_weekends)) +
  geom_density_ridges()
## Picking joint bandwidth of 0.822
# Ridgeline densities of the second-semester grade, one ridge per weekday
# alcohol level.
base_datos_clean |>
  ggplot(aes(x = grade_2nd_semester, y = alcohol_weekdays, group = alcohol_weekdays)) +
  geom_density_ridges()
## Picking joint bandwidth of 0.678
# Ridgeline densities of the second-semester grade, one ridge per weekend
# alcohol level.
base_datos_clean |>
  ggplot(aes(x = grade_2nd_semester, y = alcohol_weekends, group = alcohol_weekends)) +
  geom_density_ridges()
## Picking joint bandwidth of 0.856
# For each grade / alcohol-variable pair, draw a box plot of the grade per
# alcohol level followed by frequency polygons of the grade coloured by level.

# First-semester grade vs. weekday alcohol consumption.
ggplot(base_datos_clean, aes(x = alcohol_weekdays, y = grade_1st_semester)) +
  geom_boxplot()
ggplot(base_datos_clean, aes(x = grade_1st_semester)) +
  geom_freqpoly(aes(color = alcohol_weekdays), binwidth = 1, linewidth = 0.75)

# First-semester grade vs. weekend alcohol consumption.
ggplot(base_datos_clean, aes(x = alcohol_weekends, y = grade_1st_semester)) +
  geom_boxplot()
ggplot(base_datos_clean, aes(x = grade_1st_semester)) +
  geom_freqpoly(aes(color = alcohol_weekends), binwidth = 1, linewidth = 0.75)

# Second-semester grade vs. weekday alcohol consumption.
ggplot(base_datos_clean, aes(x = alcohol_weekdays, y = grade_2nd_semester)) +
  geom_boxplot()
ggplot(base_datos_clean, aes(x = grade_2nd_semester)) +
  geom_freqpoly(aes(color = alcohol_weekdays), binwidth = 1, linewidth = 0.75)

# Second-semester grade vs. weekend alcohol consumption.
# The frequency polygon previously plotted grade_1st_semester, duplicating the
# first-semester/weekend plot instead of matching the box plot above it.
ggplot(base_datos_clean, aes(x = alcohol_weekends, y = grade_2nd_semester)) +
  geom_boxplot()
ggplot(base_datos_clean, aes(x = grade_2nd_semester)) +
  geom_freqpoly(aes(color = alcohol_weekends), binwidth = 1, linewidth = 0.75)
# Count plots of weekday alcohol level against every qualitative column.
# Both aesthetics are resolved inside the plot data: the x variable by name
# and the y variable through the .data pronoun, instead of reaching into
# base_datos_str with `$` from within aes().
lapply(names(base_datos_str), function(col_name) {
  ggplot(base_datos_str, aes(x = alcohol_weekdays, y = .data[[col_name]])) +
    geom_count() +
    labs(title = col_name, x = "Alcohol_Weekdays", y = col_name)
})
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
##
## [[11]]
##
## [[12]]
##
## [[13]]
##
## [[14]]
##
## [[15]]
##
## [[16]]
##
## [[17]]
##
## [[18]]
##
## [[19]]
##
## [[20]]
##
## [[21]]
##
## [[22]]
##
## [[23]]
##
## [[24]]
##
## [[25]]
##
## [[26]]
##
## [[27]]
# Count plots of weekend alcohol level against every qualitative column.
# Both aesthetics are resolved inside the plot data: the x variable by name
# and the y variable through the .data pronoun, instead of reaching into
# base_datos_str with `$` from within aes().
lapply(names(base_datos_str), function(col_name) {
  ggplot(base_datos_str, aes(x = alcohol_weekends, y = .data[[col_name]])) +
    geom_count() +
    labs(title = col_name, x = "Alcohol_Weekends", y = col_name)
})
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
##
## [[11]]
##
## [[12]]
##
## [[13]]
##
## [[14]]
##
## [[15]]
##
## [[16]]
##
## [[17]]
##
## [[18]]
##
## [[19]]
##
## [[20]]
##
## [[21]]
##
## [[22]]
##
## [[23]]
##
## [[24]]
##
## [[25]]
##
## [[26]]
##
## [[27]]
# Number of students for each combination of weekday alcohol level and gender.
count(base_datos_str, alcohol_weekdays, gender)
## # A tibble: 6 × 3
## alcohol_weekdays gender n
## <chr> <chr> <int>
## 1 High Female 9
## 2 High Male 25
## 3 Low Female 363
## 4 Low Male 209
## 5 Moderate Female 11
## 6 Moderate Male 32
# Tile plot of the weekday-alcohol-by-gender counts; fill encodes the count.
ggplot(
  count(base_datos_str, alcohol_weekdays, gender),
  aes(x = alcohol_weekdays, y = gender, fill = n)
) +
  geom_tile()
#' Heat map of weekday alcohol level against one qualitative column.
#'
#' Counts the rows of `base_datos_str` for each combination of
#' `alcohol_weekdays` and the requested column, and draws the counts as tiles.
#'
#' @param col_name Name of the column to cross with `alcohol_weekdays`.
#' @return A ggplot object.
create_heatmap <- function(col_name) {
  var_y <- sym(col_name)
  conteos <- count(base_datos_str, alcohol_weekdays, !!var_y)
  titulo <- paste("Comparación de alcohol entre semana con", col_name)
  ggplot(conteos, aes(x = alcohol_weekdays, y = !!var_y)) +
    geom_tile(aes(fill = n), color = "white") +
    scale_fill_gradient(low = "white", high = "blue") +
    labs(title = titulo, x = "Alcohol entre semana", y = col_name)
}
# Apply the heat-map function to every qualitative column except
# alcohol_weekdays itself. setdiff() replaces x[-which(...)], which selects
# nothing at all when the name is not found.
heatmap_plots <- lapply(
  setdiff(names(base_datos_str), "alcohol_weekdays"),
  create_heatmap
)
print(heatmap_plots)
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
##
## [[11]]
##
## [[12]]
##
## [[13]]
##
## [[14]]
##
## [[15]]
##
## [[16]]
##
## [[17]]
##
## [[18]]
##
## [[19]]
##
## [[20]]
##
## [[21]]
##
## [[22]]
##
## [[23]]
##
## [[24]]
##
## [[25]]
##
## [[26]]
#' Heat map of weekend alcohol level against one qualitative column.
#'
#' Counts the rows of `base_datos_str` for each combination of
#' `alcohol_weekends` and the requested column, and draws the counts as tiles.
#' This definition replaces any earlier `create_heatmap` in the session.
#'
#' @param col_name Name of the column to cross with `alcohol_weekends`.
#' @return A ggplot object.
create_heatmap <- function(col_name) {
  var_y <- sym(col_name)
  conteos <- count(base_datos_str, alcohol_weekends, !!var_y)
  titulo <- paste("Comparación de alcohol en fin de semana con", col_name)
  ggplot(conteos, aes(x = alcohol_weekends, y = !!var_y)) +
    geom_tile(aes(fill = n), color = "white") +
    scale_fill_gradient(low = "white", high = "blue") +
    labs(title = titulo, x = "Alcohol en fin de semana", y = col_name)
}
# Apply the heat-map function to every qualitative column except
# alcohol_weekends itself. setdiff() replaces x[-which(...)], which selects
# nothing at all when the name is not found.
heatmap_plots <- lapply(
  setdiff(names(base_datos_str), "alcohol_weekends"),
  create_heatmap
)
print(heatmap_plots)
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
##
## [[11]]
##
## [[12]]
##
## [[13]]
##
## [[14]]
##
## [[15]]
##
## [[16]]
##
## [[17]]
##
## [[18]]
##
## [[19]]
##
## [[20]]
##
## [[21]]
##
## [[22]]
##
## [[23]]
##
## [[24]]
##
## [[25]]
##
## [[26]]
# Contingency table of every qualitative column (rows) against the weekday
# alcohol level (columns). The argument must stay named `col`: table() uses
# that symbol as the row-dimension label in the printed tables.
tablas_contingencias_1 <- lapply(
  base_datos_str,
  \(col) table(col, base_datos_str[["alcohol_weekdays"]])
)
print(tablas_contingencias_1)
## $school
##
## col High Low Moderate
## Gabriel Pereira 22 379 22
## Mousinho da Silveira 12 193 21
##
## $gender
##
## col High Low Moderate
## Female 9 363 11
## Male 25 209 32
##
## $housing_type
##
## col High Low Moderate
## Rural 10 168 19
## Urban 24 404 24
##
## $family_size
##
## col High Low Moderate
## Above 3 23 408 26
## Up to 3 11 164 17
##
## $parental_status
##
## col High Low Moderate
## Living Together 31 501 37
## Separated 3 71 6
##
## $mother_education
##
## col High Low Moderate
## High School 8 118 13
## Higher Education 9 155 11
## Lower Secondary School 7 173 6
## None 0 5 1
## Primary School 10 121 12
##
## $father_education
##
## col High Low Moderate
## High School 5 117 9
## Higher Education 7 110 11
## Lower Secondary School 11 188 10
## None 0 7 0
## Primary School 11 150 13
##
## $mother_work
##
## col High Low Moderate
## Health 0 45 3
## Homemaker 8 119 8
## other 14 229 15
## Services 9 118 9
## Teacher 3 61 8
##
## $father_work
##
## col High Low Moderate
## Health 1 20 2
## Homemaker 0 39 3
## other 17 329 21
## Services 14 150 17
## Teacher 2 34 0
##
## $reason_school_choice
##
## col High Low Moderate
## Course Preference 13 258 14
## Near Home 10 127 12
## Other 7 56 9
## Reputation 4 131 8
##
## $legal_responsibility
##
## col High Low Moderate
## Father 8 133 12
## Mother 20 408 27
## Other 6 31 4
##
## $commute_time
##
## col High Low Moderate
## 15 to 30 min 11 189 13
## 30 min to 1h 4 42 8
## More than 1h 3 12 1
## Up to 15 min 16 329 21
##
## $weekly_study_time
##
## col High Low Moderate
## 2 to 5h 14 278 13
## 5 to 10h 2 94 1
## More than 10h 2 29 4
## Up to 2h 16 171 25
##
## $extra_educational_support
##
## col High Low Moderate
## No 30 510 41
## Yes 4 62 2
##
## $parental_educational_support
##
## col High Low Moderate
## No 12 215 24
## Yes 22 357 19
##
## $private_tutoring
##
## col High Low Moderate
## No 31 539 40
## Yes 3 33 3
##
## $extracurricular_activities
##
## col High Low Moderate
## No 14 296 24
## Yes 20 276 19
##
## $attended_daycare
##
## col High Low Moderate
## No 10 109 9
## Yes 24 463 34
##
## $desire_graduate_education
##
## col High Low Moderate
## No 8 55 6
## Yes 26 517 37
##
## $has_internet
##
## col High Low Moderate
## No 5 135 11
## Yes 29 437 32
##
## $is_dating
##
## col High Low Moderate
## No 14 364 32
## Yes 20 208 11
##
## $good_family_relationship
##
## col High Low Moderate
## Fair 6 92 3
## Good 24 440 33
## Poor 4 40 7
##
## $free_time_after_school
##
## col High Low Moderate
## High 17 203 26
## Low 7 135 10
## Moderate 10 234 7
##
## $time_with_friends
##
## col High Low Moderate
## High 23 202 26
## Low 4 182 7
## Moderate 7 188 10
##
## $alcohol_weekdays
##
## col High Low Moderate
## High 34 0 0
## Low 0 572 0
## Moderate 0 0 43
##
## $alcohol_weekends
##
## col High Low Moderate
## High 26 74 32
## Low 4 391 2
## Moderate 4 107 9
##
## $health_status
##
## col High Low Moderate
## Fair 9 110 5
## Good 19 310 28
## Poor 6 152 10
# Mosaic plot of every contingency table against weekday alcohol level.
# mosaicplot() puts the first table dimension (the qualitative column) on the
# x axis and the second (alcohol level) on the y axis, so the labels are
# assigned in that order. Previously the alcohol label sat on the x axis and
# ylab evaluated to NULL because names() of a two-way table is NULL.
# invisible() keeps the list of NULLs returned by mosaicplot() from printing.
invisible(lapply(names(tablas_contingencias_1), function(nombre) {
  mosaicplot(
    tablas_contingencias_1[[nombre]],
    color = TRUE,
    xlab = nombre,
    ylab = "Alcohol entre semana",
    main = paste("Alcohol entre Semana y", nombre)
  )
}))
## [[1]]
## NULL
##
## [[2]]
## NULL
##
## [[3]]
## NULL
##
## [[4]]
## NULL
##
## [[5]]
## NULL
##
## [[6]]
## NULL
##
## [[7]]
## NULL
##
## [[8]]
## NULL
##
## [[9]]
## NULL
##
## [[10]]
## NULL
##
## [[11]]
## NULL
##
## [[12]]
## NULL
##
## [[13]]
## NULL
##
## [[14]]
## NULL
##
## [[15]]
## NULL
##
## [[16]]
## NULL
##
## [[17]]
## NULL
##
## [[18]]
## NULL
##
## [[19]]
## NULL
##
## [[20]]
## NULL
##
## [[21]]
## NULL
##
## [[22]]
## NULL
##
## [[23]]
## NULL
##
## [[24]]
## NULL
##
## [[25]]
## NULL
##
## [[26]]
## NULL
##
## [[27]]
## NULL
# Run the chi-squared test of independence on every contingency table and
# keep the results in a list named after the qualitative columns.
# NOTE(review): the list includes the table of alcohol_weekdays against
# itself, so that entry is trivially significant and carries no information.
# NOTE(review): several tables have very small cell counts (e.g. the "None"
# education rows) — presumably the source of the "approximation may be
# incorrect" warnings; confirm before relying on those p-values.
chi_cuadrado_1 <- lapply(tablas_contingencias_1, chisq.test)
## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect
## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect
## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect
## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect
## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect
## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect
## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect
## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect
## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect
## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect
## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect
## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect
## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect
## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect
# Show the test result for every qualitative column.
print(chi_cuadrado_1)
## $school
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 4.0191, df = 2, p-value = 0.134
##
##
## $gender
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 39.436, df = 2, p-value = 2.733e-09
##
##
## $housing_type
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 4.1675, df = 2, p-value = 0.1245
##
##
## $family_size
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 2.3978, df = 2, p-value = 0.3015
##
##
## $parental_status
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 0.49529, df = 2, p-value = 0.7806
##
##
## $mother_education
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 9.3106, df = 8, p-value = 0.3168
##
##
## $father_education
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 4.1102, df = 8, p-value = 0.847
##
##
## $mother_work
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 6.1653, df = 8, p-value = 0.6287
##
##
## $father_work
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 10.683, df = 8, p-value = 0.2203
##
##
## $reason_school_choice
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 12.356, df = 6, p-value = 0.05448
##
##
## $legal_responsibility
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 9.6798, df = 4, p-value = 0.04618
##
##
## $commute_time
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 13.687, df = 6, p-value = 0.03333
##
##
## $weekly_study_time
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 23.815, df = 6, p-value = 0.0005648
##
##
## $extra_educational_support
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 1.696, df = 2, p-value = 0.4283
##
##
## $parental_educational_support
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 5.7747, df = 2, p-value = 0.05572
##
##
## $private_tutoring
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 0.60637, df = 2, p-value = 0.7385
##
##
## $extracurricular_activities
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 1.7848, df = 2, p-value = 0.4097
##
##
## $attended_daycare
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 2.2162, df = 2, p-value = 0.3302
##
##
## $desire_graduate_education
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 7.0739, df = 2, p-value = 0.0291
##
##
## $has_internet
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 1.5606, df = 2, p-value = 0.4583
##
##
## $is_dating
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 9.4615, df = 2, p-value = 0.00882
##
##
## $good_family_relationship
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 7.4854, df = 4, p-value = 0.1124
##
##
## $free_time_after_school
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 15.161, df = 4, p-value = 0.004379
##
##
## $time_with_friends
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 24.017, df = 4, p-value = 7.925e-05
##
##
## $alcohol_weekdays
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 1298, df = 4, p-value < 2.2e-16
##
##
## $alcohol_weekends
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 171.75, df = 4, p-value < 2.2e-16
##
##
## $health_status
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 4.2113, df = 4, p-value = 0.3782
# Contingency table of every qualitative column (rows) against the weekend
# alcohol level (columns). The argument must stay named `col`: table() uses
# that symbol as the row-dimension label in the printed tables.
tablas_contingencias_2 <- lapply(
  base_datos_str,
  \(col) table(col, base_datos_str[["alcohol_weekends"]])
)
print(tablas_contingencias_2)
## $school
##
## col High Low Moderate
## Gabriel Pereira 87 259 77
## Mousinho da Silveira 45 138 43
##
## $gender
##
## col High Low Moderate
## Female 37 275 71
## Male 95 122 49
##
## $housing_type
##
## col High Low Moderate
## Rural 39 120 38
## Urban 93 277 82
##
## $family_size
##
## col High Low Moderate
## Above 3 85 288 84
## Up to 3 47 109 36
##
## $parental_status
##
## col High Low Moderate
## Living Together 117 341 111
## Separated 15 56 9
##
## $mother_education
##
## col High Low Moderate
## High School 33 75 31
## Higher Education 35 110 30
## Lower Secondary School 28 128 30
## None 2 3 1
## Primary School 34 81 28
##
## $father_education
##
## col High Low Moderate
## High School 30 70 31
## Higher Education 29 80 19
## Lower Secondary School 39 133 37
## None 0 7 0
## Primary School 34 107 33
##
## $mother_work
##
## col High Low Moderate
## Health 10 26 12
## Homemaker 27 84 24
## other 46 168 44
## Services 31 75 30
## Teacher 18 44 10
##
## $father_work
##
## col High Low Moderate
## Health 6 17 0
## Homemaker 7 31 4
## other 71 221 75
## Services 45 99 37
## Teacher 3 29 4
##
## $reason_school_choice
##
## col High Low Moderate
## Course Preference 59 181 45
## Near Home 32 89 28
## Other 19 39 14
## Reputation 22 88 33
##
## $legal_responsibility
##
## col High Low Moderate
## Father 32 94 27
## Mother 92 280 83
## Other 8 23 10
##
## $commute_time
##
## col High Low Moderate
## 15 to 30 min 43 130 40
## 30 min to 1h 12 31 11
## More than 1h 7 9 0
## Up to 15 min 70 227 69
##
## $weekly_study_time
##
## col High Low Moderate
## 2 to 5h 52 193 60
## 5 to 10h 5 72 20
## More than 10h 6 25 4
## Up to 2h 69 107 36
##
## $extra_educational_support
##
## col High Low Moderate
## No 123 351 107
## Yes 9 46 13
##
## $parental_educational_support
##
## col High Low Moderate
## No 64 145 42
## Yes 68 252 78
##
## $private_tutoring
##
## col High Low Moderate
## No 120 375 115
## Yes 12 22 5
##
## $extracurricular_activities
##
## col High Low Moderate
## No 61 210 63
## Yes 71 187 57
##
## $attended_daycare
##
## col High Low Moderate
## No 33 71 24
## Yes 99 326 96
##
## $desire_graduate_education
##
## col High Low Moderate
## No 20 35 14
## Yes 112 362 106
##
## $has_internet
##
## col High Low Moderate
## No 27 101 23
## Yes 105 296 97
##
## $is_dating
##
## col High Low Moderate
## No 87 248 75
## Yes 45 149 45
##
## $good_family_relationship
##
## col High Low Moderate
## Fair 26 55 20
## Good 92 314 91
## Poor 14 28 9
##
## $free_time_after_school
##
## col High Low Moderate
## High 66 126 54
## Low 26 106 20
## Moderate 40 165 46
##
## $time_with_friends
##
## col High Low Moderate
## High 95 106 50
## Low 14 154 25
## Moderate 23 137 45
##
## $alcohol_weekdays
##
## col High Low Moderate
## High 26 4 4
## Low 74 391 107
## Moderate 32 2 9
##
## $alcohol_weekends
##
## col High Low Moderate
## High 132 0 0
## Low 0 397 0
## Moderate 0 0 120
##
## $health_status
##
## col High Low Moderate
## Fair 24 78 22
## Good 85 207 65
## Poor 23 112 33
# Mosaic plot of every contingency table against weekend alcohol level.
# mosaicplot() puts the first table dimension (the qualitative column) on the
# x axis and the second (alcohol level) on the y axis, so the labels are
# assigned in that order. Previously the alcohol label sat on the x axis and
# ylab evaluated to NULL because names() of a two-way table is NULL.
# invisible() keeps the list of NULLs returned by mosaicplot() from printing.
invisible(lapply(names(tablas_contingencias_2), function(nombre) {
  mosaicplot(
    tablas_contingencias_2[[nombre]],
    color = TRUE,
    xlab = nombre,
    ylab = "Alcohol Fin de Semana",
    main = paste("Alcohol Fin de Semana y", nombre)
  )
}))
## [[1]]
## NULL
##
## [[2]]
## NULL
##
## [[3]]
## NULL
##
## [[4]]
## NULL
##
## [[5]]
## NULL
##
## [[6]]
## NULL
##
## [[7]]
## NULL
##
## [[8]]
## NULL
##
## [[9]]
## NULL
##
## [[10]]
## NULL
##
## [[11]]
## NULL
##
## [[12]]
## NULL
##
## [[13]]
## NULL
##
## [[14]]
## NULL
##
## [[15]]
## NULL
##
## [[16]]
## NULL
##
## [[17]]
## NULL
##
## [[18]]
## NULL
##
## [[19]]
## NULL
##
## [[20]]
## NULL
##
## [[21]]
## NULL
##
## [[22]]
## NULL
##
## [[23]]
## NULL
##
## [[24]]
## NULL
##
## [[25]]
## NULL
##
## [[26]]
## NULL
##
## [[27]]
## NULL
# Run the chi-squared test of independence on every contingency table and
# keep the results in a list named after the qualitative columns.
# NOTE(review): the list includes the table of alcohol_weekends against
# itself, so that entry is trivially significant and carries no information.
# NOTE(review): some tables contain zero or very small cell counts —
# presumably the source of the "approximation may be incorrect" warnings;
# confirm before relying on those p-values.
chi_cuadrado_2 <- lapply(tablas_contingencias_2, chisq.test)
## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect
## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect
## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect
## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect
# Show the test result for every qualitative column.
print(chi_cuadrado_2)
## $school
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 0.085819, df = 2, p-value = 0.958
##
##
## $gender
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 69.654, df = 2, p-value = 7.495e-16
##
##
## $housing_type
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 0.14167, df = 2, p-value = 0.9316
##
##
## $family_size
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 3.171, df = 2, p-value = 0.2049
##
##
## $parental_status
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 3.8628, df = 2, p-value = 0.1449
##
##
## $mother_education
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 10.255, df = 8, p-value = 0.2476
##
##
## $father_education
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 9.9856, df = 8, p-value = 0.266
##
##
## $mother_work
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 7.0431, df = 8, p-value = 0.532
##
##
## $father_work
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 18.679, df = 8, p-value = 0.01668
##
##
## $reason_school_choice
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 6.8145, df = 6, p-value = 0.3383
##
##
## $legal_responsibility
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 1.0722, df = 4, p-value = 0.8987
##
##
## $commute_time
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 8.0027, df = 6, p-value = 0.2379
##
##
## $weekly_study_time
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 37.497, df = 6, p-value = 1.409e-06
##
##
## $extra_educational_support
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 2.4215, df = 2, p-value = 0.298
##
##
## $parental_educational_support
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 6.8137, df = 2, p-value = 0.03314
##
##
## $private_tutoring
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 3.0945, df = 2, p-value = 0.2128
##
##
## $extracurricular_activities
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 1.8354, df = 2, p-value = 0.3994
##
##
## $attended_daycare
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 3.1753, df = 2, p-value = 0.2044
##
##
## $desire_graduate_education
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 4.3507, df = 2, p-value = 0.1136
##
##
## $has_internet
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 2.7657, df = 2, p-value = 0.2509
##
##
## $is_dating
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 0.53281, df = 2, p-value = 0.7661
##
##
## $good_family_relationship
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 5.057, df = 4, p-value = 0.2815
##
##
## $free_time_after_school
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 18.865, df = 4, p-value = 0.0008356
##
##
## $time_with_friends
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 94.004, df = 4, p-value < 2.2e-16
##
##
## $alcohol_weekdays
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 171.75, df = 4, p-value < 2.2e-16
##
##
## $alcohol_weekends
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 1298, df = 4, p-value < 2.2e-16
##
##
## $health_status
##
## Pearson's Chi-squared test
##
## data: X[[i]]
## X-squared = 7.4814, df = 4, p-value = 0.1125